Skip to main content
Glama

Edit-MCP

icu.rs (45.6 kB)
// Copyright (c) Microsoft Corporation. // Licensed under the MIT License. //! Bindings to the ICU library. use std::cmp::Ordering; use std::ffi::CStr; use std::mem; use std::mem::MaybeUninit; use std::ops::Range; use std::ptr::{null, null_mut}; use crate::arena::{Arena, ArenaString, scratch_arena}; use crate::buffer::TextBuffer; use crate::unicode::Utf8Chars; use crate::{apperr, arena_format, sys}; #[derive(Clone, Copy)] pub struct Encoding { pub label: &'static str, pub canonical: &'static str, } pub struct Encodings { pub preferred: &'static [Encoding], pub all: &'static [Encoding], } static mut ENCODINGS: Encodings = Encodings { preferred: &[], all: &[] }; /// Returns a list of encodings ICU supports. pub fn get_available_encodings() -> &'static Encodings { // OnceCell for people that want to put it into a static. #[allow(static_mut_refs)] unsafe { if ENCODINGS.all.is_empty() { let scratch = scratch_arena(None); let mut preferred = Vec::new_in(&*scratch); let mut alternative = Vec::new_in(&*scratch); // These encodings are always available. preferred.push(Encoding { label: "UTF-8", canonical: "UTF-8" }); preferred.push(Encoding { label: "UTF-8 BOM", canonical: "UTF-8 BOM" }); if let Ok(f) = init_if_needed() { let mut n = 0; loop { let name = (f.ucnv_getAvailableName)(n); if name.is_null() { break; } n += 1; let name = CStr::from_ptr(name).to_str().unwrap_unchecked(); // We have already pushed UTF-8 above and can skip it. // There is no need to filter UTF-8 BOM here, // since ICU does not distinguish it from UTF-8. 
if name.is_empty() || name == "UTF-8" { continue; } let mut status = icu_ffi::U_ZERO_ERROR; let mime = (f.ucnv_getStandardName)( name.as_ptr(), c"MIME".as_ptr() as *const _, &mut status, ); if !mime.is_null() && status.is_success() { let mime = CStr::from_ptr(mime).to_str().unwrap_unchecked(); preferred.push(Encoding { label: mime, canonical: name }); } else { alternative.push(Encoding { label: name, canonical: name }); } } } let preferred_len = preferred.len(); // Combine the preferred and alternative encodings into a single list. let mut all = Vec::with_capacity(preferred.len() + alternative.len()); all.extend(preferred); all.extend(alternative); let all = all.leak(); ENCODINGS.preferred = &all[..preferred_len]; ENCODINGS.all = &all[..]; } &ENCODINGS } } /// Formats the given ICU error code into a human-readable string. pub fn apperr_format(f: &mut std::fmt::Formatter<'_>, code: u32) -> std::fmt::Result { fn format(code: u32) -> &'static str { let Ok(f) = init_if_needed() else { return ""; }; let status = icu_ffi::UErrorCode::new(code); let ptr = unsafe { (f.u_errorName)(status) }; if ptr.is_null() { return ""; } let str = unsafe { CStr::from_ptr(ptr) }; str.to_str().unwrap_or("") } let msg = format(code); if !msg.is_empty() { write!(f, "ICU Error: {msg}") } else { write!(f, "ICU Error: {code:#08x}") } } /// Converts between two encodings using ICU. pub struct Converter<'pivot> { source: *mut icu_ffi::UConverter, target: *mut icu_ffi::UConverter, pivot_buffer: &'pivot mut [MaybeUninit<u16>], pivot_source: *mut u16, pivot_target: *mut u16, reset: bool, } impl Drop for Converter<'_> { fn drop(&mut self) { let f = assume_loaded(); unsafe { (f.ucnv_close)(self.source) }; unsafe { (f.ucnv_close)(self.target) }; } } impl<'pivot> Converter<'pivot> { /// Constructs a new `Converter` instance. /// /// # Parameters /// /// * `pivot_buffer`: A buffer used to cache partial conversions. /// Don't make it too small. 
/// * `source_encoding`: The source encoding name (e.g., "UTF-8"). /// * `target_encoding`: The target encoding name (e.g., "UTF-16"). pub fn new( pivot_buffer: &'pivot mut [MaybeUninit<u16>], source_encoding: &str, target_encoding: &str, ) -> apperr::Result<Self> { let f = init_if_needed()?; let arena = scratch_arena(None); let source_encoding = Self::append_nul(&arena, source_encoding); let target_encoding = Self::append_nul(&arena, target_encoding); let mut status = icu_ffi::U_ZERO_ERROR; let source = unsafe { (f.ucnv_open)(source_encoding.as_ptr(), &mut status) }; let target = unsafe { (f.ucnv_open)(target_encoding.as_ptr(), &mut status) }; if status.is_failure() { if !source.is_null() { unsafe { (f.ucnv_close)(source) }; } if !target.is_null() { unsafe { (f.ucnv_close)(target) }; } return Err(status.as_error()); } let pivot_source = pivot_buffer.as_mut_ptr() as *mut u16; let pivot_target = unsafe { pivot_source.add(pivot_buffer.len()) }; Ok(Self { source, target, pivot_buffer, pivot_source, pivot_target, reset: true }) } fn append_nul<'a>(arena: &'a Arena, input: &str) -> ArenaString<'a> { arena_format!(arena, "{}\0", input) } /// Performs one step of the encoding conversion. /// /// # Parameters /// /// * `input`: The input buffer to convert from. /// It should be in the `source_encoding` that was previously specified. /// * `output`: The output buffer to convert to. /// It should be in the `target_encoding` that was previously specified. /// /// # Returns /// /// A tuple containing: /// 1. The number of bytes read from the input buffer. /// 2. The number of bytes written to the output buffer. 
pub fn convert( &mut self, input: &[u8], output: &mut [MaybeUninit<u8>], ) -> apperr::Result<(usize, usize)> { let f = assume_loaded(); let input_beg = input.as_ptr(); let input_end = unsafe { input_beg.add(input.len()) }; let mut input_ptr = input_beg; let output_beg = output.as_mut_ptr() as *mut u8; let output_end = unsafe { output_beg.add(output.len()) }; let mut output_ptr = output_beg; let pivot_beg = self.pivot_buffer.as_mut_ptr() as *mut u16; let pivot_end = unsafe { pivot_beg.add(self.pivot_buffer.len()) }; let flush = input.is_empty(); let mut status = icu_ffi::U_ZERO_ERROR; unsafe { (f.ucnv_convertEx)( /* target_cnv */ self.target, /* source_cnv */ self.source, /* target */ &mut output_ptr, /* target_limit */ output_end, /* source */ &mut input_ptr, /* source_limit */ input_end, /* pivot_start */ pivot_beg, /* pivot_source */ &mut self.pivot_source, /* pivot_target */ &mut self.pivot_target, /* pivot_limit */ pivot_end, /* reset */ self.reset, /* flush */ flush, /* status */ &mut status, ); } self.reset = false; if status.is_failure() && status != icu_ffi::U_BUFFER_OVERFLOW_ERROR { return Err(status.as_error()); } let input_advance = unsafe { input_ptr.offset_from(input_beg) as usize }; let output_advance = unsafe { output_ptr.offset_from(output_beg) as usize }; Ok((input_advance, output_advance)) } } // In benchmarking, I found that the performance does not really change much by changing this value. // I picked 64 because it seemed like a reasonable lower bound. const CACHE_SIZE: usize = 64; /// Caches a chunk of TextBuffer contents (UTF-8) in UTF-16 format. struct Cache { /// The translated text. Contains [`Cache::utf16_len`]-many valid items. utf16: [u16; CACHE_SIZE], /// For each character in [`Cache::utf16`] this stores the offset in the [`TextBuffer`], /// relative to the start offset stored in `native_beg`. /// This has the same length as [`Cache::utf16`]. 
utf16_to_utf8_offsets: [u16; CACHE_SIZE], /// `utf8_to_utf16_offsets[native_offset - native_beg]` will tell you which character in /// [`Cache::utf16`] maps to the given `native_offset` in the underlying [`TextBuffer`]. /// Contains `native_end - native_beg`-many valid items. utf8_to_utf16_offsets: [u16; CACHE_SIZE], /// The number of valid items in [`Cache::utf16`]. utf16_len: usize, /// Offset of the first non-ASCII character. /// Less than or equal to [`Cache::utf16_len`]. native_indexing_limit: usize, /// The range of UTF-8 text in the [`TextBuffer`] that this chunk covers. utf8_range: Range<usize>, } struct DoubleCache { cache: [Cache; 2], /// You can consider this a 1 bit index into `cache`. mru: bool, } /// A wrapper around ICU's `UText` struct. /// /// In our case its only purpose is to adapt a [`TextBuffer`] for ICU. /// /// # Safety /// /// Warning! No lifetime tracking is done here. /// I initially did it properly with a PhantomData marker for the TextBuffer /// lifetime, but it was a pain so now I don't. Not a big deal in our case. pub struct Text(&'static mut icu_ffi::UText); impl Drop for Text { fn drop(&mut self) { let f = assume_loaded(); unsafe { (f.utext_close)(self.0) }; } } impl Text { /// Constructs an ICU `UText` instance from a [`TextBuffer`]. /// /// # Safety /// /// The caller must ensure that the given [`TextBuffer`] /// outlives the returned `Text` instance. 
pub unsafe fn new(tb: &TextBuffer) -> apperr::Result<Self> { let f = init_if_needed()?; let mut status = icu_ffi::U_ZERO_ERROR; let ptr = unsafe { (f.utext_setup)(null_mut(), size_of::<DoubleCache>() as i32, &mut status) }; if status.is_failure() { return Err(status.as_error()); } const FUNCS: icu_ffi::UTextFuncs = icu_ffi::UTextFuncs { table_size: size_of::<icu_ffi::UTextFuncs>() as i32, reserved1: 0, reserved2: 0, reserved3: 0, clone: Some(utext_clone), native_length: Some(utext_native_length), access: Some(utext_access), extract: None, replace: None, copy: None, map_offset_to_native: Some(utext_map_offset_to_native), map_native_index_to_utf16: Some(utext_map_native_index_to_utf16), close: None, spare1: None, spare2: None, spare3: None, }; let ut = unsafe { &mut *ptr }; ut.p_funcs = &FUNCS; ut.context = tb as *const TextBuffer as *mut _; ut.a = tb.generation() as i64; // ICU unfortunately expects a `UText` instance to have valid contents after construction. utext_access(ut, 0, true); Ok(Self(ut)) } } fn text_buffer_from_utext<'a>(ut: &icu_ffi::UText) -> &'a TextBuffer { unsafe { &*(ut.context as *const TextBuffer) } } fn double_cache_from_utext<'a>(ut: &icu_ffi::UText) -> &'a mut DoubleCache { unsafe { &mut *(ut.p_extra as *mut DoubleCache) } } extern "C" fn utext_clone( dest: *mut icu_ffi::UText, src: &icu_ffi::UText, deep: bool, status: &mut icu_ffi::UErrorCode, ) -> *mut icu_ffi::UText { if status.is_failure() { return null_mut(); } if deep { *status = icu_ffi::U_UNSUPPORTED_ERROR; return null_mut(); } let f = assume_loaded(); let ut_ptr = unsafe { (f.utext_setup)(dest, size_of::<DoubleCache>() as i32, status) }; if status.is_failure() { return null_mut(); } unsafe { let ut = &mut *ut_ptr; let src_double_cache = double_cache_from_utext(src); let dst_double_cache = double_cache_from_utext(ut); let src_cache = &src_double_cache.cache[src_double_cache.mru as usize]; let dst_cache = &mut dst_double_cache.cache[dst_double_cache.mru as usize]; ut.provider_properties 
= src.provider_properties; ut.chunk_native_limit = src.chunk_native_limit; ut.native_indexing_limit = src.native_indexing_limit; ut.chunk_native_start = src.chunk_native_start; ut.chunk_offset = src.chunk_offset; ut.chunk_length = src.chunk_length; ut.chunk_contents = dst_cache.utf16.as_ptr(); ut.p_funcs = src.p_funcs; ut.context = src.context; ut.a = src.a; // I wonder if it would make sense to use a Cow here. But probably not. std::ptr::copy_nonoverlapping(src_cache, dst_cache, 1); } ut_ptr } extern "C" fn utext_native_length(ut: &mut icu_ffi::UText) -> i64 { let tb = text_buffer_from_utext(ut); tb.text_length() as i64 } extern "C" fn utext_access(ut: &mut icu_ffi::UText, native_index: i64, forward: bool) -> bool { if let Some(cache) = utext_access_impl(ut, native_index, forward) { let native_off = native_index as usize - cache.utf8_range.start; ut.chunk_contents = cache.utf16.as_ptr(); ut.chunk_length = cache.utf16_len as i32; ut.chunk_offset = cache.utf8_to_utf16_offsets[native_off] as i32; ut.chunk_native_start = cache.utf8_range.start as i64; ut.chunk_native_limit = cache.utf8_range.end as i64; ut.native_indexing_limit = cache.native_indexing_limit as i32; true } else { false } } fn utext_access_impl<'a>( ut: &mut icu_ffi::UText, native_index: i64, forward: bool, ) -> Option<&'a mut Cache> { let tb = text_buffer_from_utext(ut); let mut index_contained = native_index; if !forward { index_contained -= 1; } if index_contained < 0 || index_contained as usize >= tb.text_length() { return None; } let index_contained = index_contained as usize; let native_index = native_index as usize; let double_cache = double_cache_from_utext(ut); let dirty = ut.a != tb.generation() as i64; if dirty { // The text buffer contents have changed. // Invalidate both caches so that future calls don't mistakenly use them // when they enter the for loop in the else branch below (`dirty == false`). 
double_cache.cache[0].utf16_len = 0; double_cache.cache[1].utf16_len = 0; double_cache.cache[0].utf8_range = 0..0; double_cache.cache[1].utf8_range = 0..0; ut.a = tb.generation() as i64; } else { // Check if one of the caches already contains the requested range. for (i, cache) in double_cache.cache.iter_mut().enumerate() { if cache.utf8_range.contains(&index_contained) { double_cache.mru = i != 0; return Some(cache); } } } // Turn the least recently used cache into the most recently used one. let double_cache = double_cache_from_utext(ut); double_cache.mru = !double_cache.mru; let cache = &mut double_cache.cache[double_cache.mru as usize]; // In order to safely fit any UTF-8 character into our cache, // we must assume the worst case of a 4-byte long encoding. const UTF16_LEN_LIMIT: usize = CACHE_SIZE - 4; let utf8_len_limit; let native_start; if forward { utf8_len_limit = (tb.text_length() - native_index).min(UTF16_LEN_LIMIT); native_start = native_index; } else { // The worst case ratio for UTF-8 to UTF-16 is 1:1, when the text is ASCII. // This allows us to safely subtract the UTF-16 buffer size // and assume that whatever we read as UTF-8 will fit. // TODO: Test what happens if you have lots of invalid UTF-8 text blow up to U+FFFD. utf8_len_limit = native_index.min(UTF16_LEN_LIMIT); // Since simply subtracting an offset may end up in the middle of a codepoint sequence, // we must align the offset to the next codepoint boundary. // Here we skip trail bytes until we find a lead. let mut beg = native_index - utf8_len_limit; let chunk = tb.read_forward(beg); for &c in chunk { if c & 0b1100_0000 != 0b1000_0000 { break; } beg += 1; } native_start = beg; } // Translate the given range from UTF-8 to UTF-16. // NOTE: This code makes the assumption that the `native_index` is always // at UTF-8 codepoint boundaries which technically isn't guaranteed. 
let mut utf16_len = 0; let mut utf8_len = 0; let mut ascii_len = 0; 'outer: loop { let initial_utf8_len = utf8_len; let chunk = tb.read_forward(native_start + utf8_len); if chunk.is_empty() { break; } let mut it = Utf8Chars::new(chunk, 0); // If we've only seen ASCII so far we can fast-pass the UTF-16 translation, // because we can just widen from u8 -> u16. if utf16_len == ascii_len { let haystack = &chunk[..chunk.len().min(utf8_len_limit - ascii_len)]; // When it comes to performance, and the search space is small (which it is here), // it's always a good idea to keep the loops small and tight... let len = haystack.iter().position(|&c| c >= 0x80).unwrap_or(haystack.len()); // ...In this case it allows the compiler to vectorize this loop and double // the performance. Luckily, llvm doesn't unroll the loop, which is great, // because `len` will always be a relatively small number. for &c in &chunk[..len] { unsafe { *cache.utf16.get_unchecked_mut(ascii_len) = c as u16; *cache.utf16_to_utf8_offsets.get_unchecked_mut(ascii_len) = ascii_len as u16; *cache.utf8_to_utf16_offsets.get_unchecked_mut(ascii_len) = ascii_len as u16; } ascii_len += 1; } utf16_len += len; utf8_len += len; it.seek(len); if ascii_len >= UTF16_LEN_LIMIT { break; } } loop { let Some(c) = it.next() else { break; }; // Thanks to our `if utf16_len >= UTF16_LEN_LIMIT` check, // we can safely assume that this will fit. 
unsafe { let utf8_len_beg = utf8_len; let utf8_len_end = initial_utf8_len + it.offset(); while utf8_len < utf8_len_end { *cache.utf8_to_utf16_offsets.get_unchecked_mut(utf8_len) = utf16_len as u16; utf8_len += 1; } if c <= '\u{FFFF}' { *cache.utf16.get_unchecked_mut(utf16_len) = c as u16; *cache.utf16_to_utf8_offsets.get_unchecked_mut(utf16_len) = utf8_len_beg as u16; utf16_len += 1; } else { let c = c as u32 - 0x10000; let b = utf8_len_beg as u16; *cache.utf16.get_unchecked_mut(utf16_len) = (c >> 10) as u16 | 0xD800; *cache.utf16.get_unchecked_mut(utf16_len + 1) = (c & 0x3FF) as u16 | 0xDC00; *cache.utf16_to_utf8_offsets.get_unchecked_mut(utf16_len) = b; *cache.utf16_to_utf8_offsets.get_unchecked_mut(utf16_len + 1) = b; utf16_len += 2; } } if utf16_len >= UTF16_LEN_LIMIT || utf8_len >= utf8_len_limit { break 'outer; } } } // Allow for looking up past-the-end indices via // `utext_map_offset_to_native` and `utext_map_native_index_to_utf16`. cache.utf16_to_utf8_offsets[utf16_len] = utf8_len as u16; cache.utf8_to_utf16_offsets[utf8_len] = utf16_len as u16; let native_limit = native_start + utf8_len; cache.utf16_len = utf16_len; // If parts of the UTF-8 chunk are ASCII, we can tell ICU that it doesn't need to call // utext_map_offset_to_native. For some reason, uregex calls that function *a lot*, // literally half the CPU time is spent on it. 
cache.native_indexing_limit = ascii_len; cache.utf8_range = native_start..native_limit; Some(cache) } extern "C" fn utext_map_offset_to_native(ut: &icu_ffi::UText) -> i64 { debug_assert!((0..=ut.chunk_length).contains(&ut.chunk_offset)); let double_cache = double_cache_from_utext(ut); let cache = &double_cache.cache[double_cache.mru as usize]; let off_rel = cache.utf16_to_utf8_offsets[ut.chunk_offset as usize]; let off_abs = cache.utf8_range.start + off_rel as usize; off_abs as i64 } extern "C" fn utext_map_native_index_to_utf16(ut: &icu_ffi::UText, native_index: i64) -> i32 { debug_assert!((ut.chunk_native_start..=ut.chunk_native_limit).contains(&native_index)); let double_cache = double_cache_from_utext(ut); let cache = &double_cache.cache[double_cache.mru as usize]; let off_rel = cache.utf8_to_utf16_offsets[(native_index - ut.chunk_native_start) as usize]; off_rel as i32 } /// A wrapper around ICU's `URegularExpression` struct. /// /// # Safety /// /// Warning! No lifetime tracking is done here. pub struct Regex(&'static mut icu_ffi::URegularExpression); impl Drop for Regex { fn drop(&mut self) { let f = assume_loaded(); unsafe { (f.uregex_close)(self.0) }; } } impl Regex { /// Enable case-insensitive matching. pub const CASE_INSENSITIVE: i32 = icu_ffi::UREGEX_CASE_INSENSITIVE; /// If set, ^ and $ match the start and end of each line. /// Otherwise, they match the start and end of the entire string. pub const MULTILINE: i32 = icu_ffi::UREGEX_MULTILINE; /// Treat the given pattern as a literal string. pub const LITERAL: i32 = icu_ffi::UREGEX_LITERAL; /// Constructs a regex, plain and simple. Read `uregex_open` docs. /// /// # Safety /// /// The caller must ensure that the given `Text` outlives the returned `Regex` instance. 
pub unsafe fn new(pattern: &str, flags: i32, text: &Text) -> apperr::Result<Self> { let f = init_if_needed()?; unsafe { let scratch = scratch_arena(None); let mut utf16 = Vec::new_in(&*scratch); let mut status = icu_ffi::U_ZERO_ERROR; utf16.extend(pattern.encode_utf16()); let ptr = (f.uregex_open)( utf16.as_ptr(), utf16.len() as i32, icu_ffi::UREGEX_MULTILINE | icu_ffi::UREGEX_ERROR_ON_UNKNOWN_ESCAPES | flags, None, &mut status, ); // ICU describes the time unit as being dependent on CPU performance // and "typically [in] the order of milliseconds", but this claim seems // highly outdated. On my CPU from 2021, a limit of 4096 equals roughly 600ms. (f.uregex_setTimeLimit)(ptr, 4096, &mut status); (f.uregex_setUText)(ptr, text.0 as *const _ as *mut _, &mut status); if status.is_failure() { return Err(status.as_error()); } Ok(Self(&mut *ptr)) } } /// Updates the regex pattern with the given text. /// If the text contents have changed, you can pass the same text as you used /// initially and it'll trigger ICU to reload the text and invalidate its caches. /// /// # Safety /// /// The caller must ensure that the given `Text` outlives the `Regex` instance. pub unsafe fn set_text(&mut self, text: &mut Text, offset: usize) { // Get `utext_access_impl` to detect the `TextBuffer::generation` change, // and refresh its contents. This ensures that ICU doesn't reuse // stale `UText::chunk_contents`, as it has no way tell that it's stale. utext_access(text.0, offset as i64, true); let f = assume_loaded(); let mut status = icu_ffi::U_ZERO_ERROR; unsafe { (f.uregex_setUText)(self.0, text.0 as *const _ as *mut _, &mut status) }; // `uregex_setUText` resets the regex to the start of the text. // Because of this, we must also call `uregex_reset64`. unsafe { (f.uregex_reset64)(self.0, offset as i64, &mut status) }; } /// Sets the regex to the absolute offset in the underlying text. 
pub fn reset(&mut self, offset: usize) { let f = assume_loaded(); let mut status = icu_ffi::U_ZERO_ERROR; unsafe { (f.uregex_reset64)(self.0, offset as i64, &mut status) }; } } impl Iterator for Regex { type Item = Range<usize>; fn next(&mut self) -> Option<Self::Item> { let f = assume_loaded(); let mut status = icu_ffi::U_ZERO_ERROR; let ok = unsafe { (f.uregex_findNext)(self.0, &mut status) }; if !ok { return None; } let start = unsafe { (f.uregex_start64)(self.0, 0, &mut status) }; let end = unsafe { (f.uregex_end64)(self.0, 0, &mut status) }; if status.is_failure() { return None; } let start = start.max(0); let end = end.max(start); Some(start as usize..end as usize) } } static mut ROOT_COLLATOR: Option<*mut icu_ffi::UCollator> = None; /// Compares two UTF-8 strings for sorting using ICU's collation algorithm. pub fn compare_strings(a: &[u8], b: &[u8]) -> Ordering { // OnceCell for people that want to put it into a static. #[allow(static_mut_refs)] let coll = unsafe { if ROOT_COLLATOR.is_none() { ROOT_COLLATOR = Some(if let Ok(f) = init_if_needed() { let mut status = icu_ffi::U_ZERO_ERROR; (f.ucol_open)(c"".as_ptr(), &mut status) } else { null_mut() }); } ROOT_COLLATOR.unwrap_unchecked() }; if coll.is_null() { compare_strings_ascii(a, b) } else { let f = assume_loaded(); let mut status = icu_ffi::U_ZERO_ERROR; let res = unsafe { (f.ucol_strcollUTF8)( coll, a.as_ptr(), a.len() as i32, b.as_ptr(), b.len() as i32, &mut status, ) }; match res { icu_ffi::UCollationResult::UCOL_EQUAL => Ordering::Equal, icu_ffi::UCollationResult::UCOL_GREATER => Ordering::Greater, icu_ffi::UCollationResult::UCOL_LESS => Ordering::Less, } } } /// Unicode collation via `ucol_strcollUTF8`, now for ASCII! fn compare_strings_ascii(a: &[u8], b: &[u8]) -> Ordering { let mut iter = a.iter().zip(b.iter()); // Low weight: Find the first character which differs. // // Remember that result in case all remaining characters are // case-insensitive equal, because then we use that as a fallback. 
while let Some((&a, &b)) = iter.next() { if a != b { let mut order = a.cmp(&b); let la = a.to_ascii_lowercase(); let lb = b.to_ascii_lowercase(); if la == lb { // High weight: Find the first character which // differs case-insensitively. for (a, b) in iter { let la = a.to_ascii_lowercase(); let lb = b.to_ascii_lowercase(); if la != lb { order = la.cmp(&lb); break; } } } return order; } } // Fallback: The shorter string wins. a.len().cmp(&b.len()) } static mut ROOT_CASEMAP: Option<*mut icu_ffi::UCaseMap> = None; /// Converts the given UTF-8 string to lower case. /// /// Case folding differs from lower case in that the output is primarily useful /// to machines for comparisons. It's like applying Unicode normalization. pub fn fold_case<'a>(arena: &'a Arena, input: &str) -> ArenaString<'a> { // OnceCell for people that want to put it into a static. #[allow(static_mut_refs)] let casemap = unsafe { if ROOT_CASEMAP.is_none() { ROOT_CASEMAP = Some(if let Ok(f) = init_if_needed() { let mut status = icu_ffi::U_ZERO_ERROR; (f.ucasemap_open)(null(), 0, &mut status) } else { null_mut() }) } ROOT_CASEMAP.unwrap_unchecked() }; if !casemap.is_null() { let f = assume_loaded(); let mut status = icu_ffi::U_ZERO_ERROR; let mut output = Vec::new_in(arena); let mut output_len; // First, guess the output length: // TODO: What's a good heuristic here? { output.reserve_exact(input.len() + 16); let output = output.spare_capacity_mut(); output_len = unsafe { (f.ucasemap_utf8FoldCase)( casemap, output.as_mut_ptr() as *mut _, output.len() as i32, input.as_ptr() as *const _, input.len() as i32, &mut status, ) }; } // If that failed to fit, retry with the correct length. 
if status == icu_ffi::U_BUFFER_OVERFLOW_ERROR && output_len > 0 { output.reserve_exact(output_len as usize); let output = output.spare_capacity_mut(); output_len = unsafe { (f.ucasemap_utf8FoldCase)( casemap, output.as_mut_ptr() as *mut _, output.len() as i32, input.as_ptr() as *const _, input.len() as i32, &mut status, ) }; } if status.is_success() && output_len > 0 { unsafe { output.set_len(output_len as usize); } return unsafe { ArenaString::from_utf8_unchecked(output) }; } } let mut result = ArenaString::from_str(arena, input); for b in unsafe { result.as_bytes_mut() } { b.make_ascii_lowercase(); } result } // NOTE: // To keep this neat, fields are ordered by prefix (= `ucol_` before `uregex_`), // followed by functions in this order: // * Static methods (e.g. `ucnv_getAvailableName`) // * Constructors (e.g. `ucnv_open`) // * Destructors (e.g. `ucnv_close`) // * Methods, grouped by relationship // (e.g. `uregex_start64` and `uregex_end64` are near each other) // // WARNING: // The order of the fields MUST match the order of strings in the following two arrays. 
/// The table of ICU functions that are resolved at runtime.
///
/// The loader in `init_if_needed` fills this struct by treating it as a flat array of
/// function pointers and writing one pointer per name, in order: first all names in
/// `LIBICUUC_PROC_NAMES`, then all names in `LIBICUI18N_PROC_NAMES`. The field order
/// below must therefore mirror the order of those two arrays exactly.
// The field names intentionally match the names of the ICU functions.
#[allow(non_snake_case)]
#[repr(C)]
struct LibraryFunctions {
    // LIBICUUC_PROC_NAMES
    u_errorName: icu_ffi::u_errorName,
    ucasemap_open: icu_ffi::ucasemap_open,
    ucasemap_utf8FoldCase: icu_ffi::ucasemap_utf8FoldCase,
    ucnv_getAvailableName: icu_ffi::ucnv_getAvailableName,
    ucnv_getStandardName: icu_ffi::ucnv_getStandardName,
    ucnv_open: icu_ffi::ucnv_open,
    ucnv_close: icu_ffi::ucnv_close,
    ucnv_convertEx: icu_ffi::ucnv_convertEx,
    utext_setup: icu_ffi::utext_setup,
    utext_close: icu_ffi::utext_close,

    // LIBICUI18N_PROC_NAMES
    ucol_open: icu_ffi::ucol_open,
    ucol_strcollUTF8: icu_ffi::ucol_strcollUTF8,
    uregex_open: icu_ffi::uregex_open,
    uregex_close: icu_ffi::uregex_close,
    uregex_setTimeLimit: icu_ffi::uregex_setTimeLimit,
    uregex_setUText: icu_ffi::uregex_setUText,
    uregex_reset64: icu_ffi::uregex_reset64,
    uregex_findNext: icu_ffi::uregex_findNext,
    uregex_start64: icu_ffi::uregex_start64,
    uregex_end64: icu_ffi::uregex_end64,
}

// Found in libicuuc.so on UNIX, icuuc.dll/icu.dll on Windows.
// The order must match the first group of fields in `LibraryFunctions`.
const LIBICUUC_PROC_NAMES: [&CStr; 10] = [
    c"u_errorName",
    c"ucasemap_open",
    c"ucasemap_utf8FoldCase",
    c"ucnv_getAvailableName",
    c"ucnv_getStandardName",
    c"ucnv_open",
    c"ucnv_close",
    c"ucnv_convertEx",
    c"utext_setup",
    c"utext_close",
];

// Found in libicui18n.so on UNIX, icuin.dll/icu.dll on Windows.
// The order must match the trailing fields of `LibraryFunctions`
// (the ones under its `LIBICUI18N_PROC_NAMES` marker).
const LIBICUI18N_PROC_NAMES: [&CStr; 10] = [
    c"ucol_open",
    c"ucol_strcollUTF8",
    c"uregex_open",
    c"uregex_close",
    c"uregex_setTimeLimit",
    c"uregex_setUText",
    c"uregex_reset64",
    c"uregex_findNext",
    c"uregex_start64",
    c"uregex_end64",
];

/// Load state of the ICU function table stored in `LIBRARY_FUNCTIONS`.
enum LibraryFunctionsState {
    /// No load attempt has been made yet (the initial value of the static).
    Uninitialized,
    /// A load attempt was started and did not complete successfully.
    Failed,
    /// Both libraries were opened and every listed symbol was resolved.
    Loaded(LibraryFunctions),
}

// NOTE(review): this is a `static mut` that is read and written without any
// visible synchronization; presumably it is only touched from a single
// thread - confirm against the callers.
static mut LIBRARY_FUNCTIONS: LibraryFunctionsState = LibraryFunctionsState::Uninitialized;

/// Attempts to load ICU unless a load has already been attempted.
///
/// # Errors
///
/// Returns the error produced by `init_if_needed`, i.e. `apperr::APP_ICU_MISSING`
/// when the function table is not in the `Loaded` state afterwards.
pub fn init() -> apperr::Result<()> {
    init_if_needed()?;
    Ok(())
}

/// Returns the ICU function table, loading it on first use.
///
/// The actual load only runs while the state is `Uninitialized`, so a failed
/// attempt is not retried on later calls.
///
/// # Errors
///
/// Returns `apperr::APP_ICU_MISSING` if the state is anything but `Loaded`.
#[allow(static_mut_refs)]
fn init_if_needed() -> apperr::Result<&'static LibraryFunctions> {
    // Kept out of line and marked cold: it runs at most once.
    #[cold]
    fn load() {
        unsafe {
            // Mark as failed up front so that every early `return` below
            // leaves the state as `Failed` without extra bookkeeping.
            LIBRARY_FUNCTIONS = LibraryFunctionsState::Failed;

            let Ok(libicuuc) = sys::load_libicuuc() else {
                return;
            };
            let Ok(libicui18n) = sys::load_libicui18n() else {
                return;
            };

            type TransparentFunction = unsafe extern "C" fn() -> *const ();

            // OH NO I'M DOING A BAD THING
            //
            // If this assertion hits, you either forgot to update `LIBICUUC_PROC_NAMES` / `LIBICUI18N_PROC_NAMES`
            // or you're on a platform where `dlsym` behaves different from classic UNIX and Windows.
            //
            // This code assumes that we can treat the `LibraryFunctions` struct containing various different function
            // pointers as an array of `TransparentFunction` pointers. In C, this works on any platform that supports
            // POSIX `dlsym` or equivalent, but I suspect Rust is once again being extra about it. In any case, that's
            // still better than loading every function one by one, just to blow up our binary size for no reason.
            //
            // The check is evaluated at compile time: the struct must be exactly as large as
            // one function pointer per entry in the two name tables.
            const _: () = assert!(
                mem::size_of::<LibraryFunctions>()
                    == mem::size_of::<TransparentFunction>()
                        * (LIBICUUC_PROC_NAMES.len() + LIBICUI18N_PROC_NAMES.len())
            );

            // `ptr` walks over `funcs` as if it were an array of function pointers.
            let mut funcs = MaybeUninit::<LibraryFunctions>::uninit();
            let mut ptr = funcs.as_mut_ptr() as *mut TransparentFunction;

            // NOTE(review): presumably a suffix that this ICU build appends to its
            // exported symbol names - see `sys::icu_proc_suffix` to confirm.
            #[cfg(unix)]
            let scratch_outer = scratch_arena(None);
            #[cfg(unix)]
            let suffix = sys::icu_proc_suffix(&scratch_outer, libicuuc);

            // The iteration order here defines the field order `LibraryFunctions` must have.
            for (handle, names) in [(libicuuc, &LIBICUUC_PROC_NAMES[..]), (libicui18n, &LIBICUI18N_PROC_NAMES[..])] {
                for name in names {
                    // On UNIX the lookup uses the name with the suffix applied.
                    #[cfg(unix)]
                    let scratch = scratch_arena(Some(&scratch_outer));
                    #[cfg(unix)]
                    let name = &sys::add_icu_proc_suffix(&scratch, name, &suffix);

                    // A missing symbol panics in debug builds; in release builds
                    // it just aborts the load, leaving the state as `Failed`.
                    let Ok(func) = sys::get_proc_address(handle, name) else {
                        debug_assert!(
                            false,
                            "Failed to load ICU function: {}",
                            name.to_string_lossy()
                        );
                        return;
                    };

                    ptr.write(func);
                    ptr = ptr.add(1);
                }
            }

            // Every slot has been written at this point, so the table is fully initialized.
            LIBRARY_FUNCTIONS = LibraryFunctionsState::Loaded(funcs.assume_init());
        }
    }

    unsafe {
        if matches!(&LIBRARY_FUNCTIONS, LibraryFunctionsState::Uninitialized) {
            load();
        }
    }

    match unsafe { &LIBRARY_FUNCTIONS } {
        LibraryFunctionsState::Loaded(f) => Ok(f),
        _ => Err(apperr::APP_ICU_MISSING),
    }
}

/// Returns the ICU function table without attempting to load it.
///
/// # Panics
///
/// Hits `unreachable!()` if the state is not `Loaded`. Only call this where a
/// successful `init_if_needed` must already have happened.
#[allow(static_mut_refs)]
fn assume_loaded() -> &'static LibraryFunctions {
    match unsafe { &LIBRARY_FUNCTIONS } {
        LibraryFunctionsState::Loaded(f) => f,
        _ => unreachable!(),
    }
}

/// Hand-written declarations for the subset of the ICU C API used by this file.
mod icu_ffi {
    #![allow(dead_code, non_camel_case_types)]

    use std::ffi::{c_char, c_int, c_void};

    use crate::apperr;

    /// ICU status code, stored as a C `int`.
    #[derive(Copy, Clone, Eq, PartialEq)]
    #[repr(transparent)]
    pub struct UErrorCode(c_int);

    impl UErrorCode {
        /// Wraps a raw code; the value is converted with an `as c_int` cast.
        pub const fn new(code: u32) -> Self {
            Self(code as c_int)
        }

        /// True for zero and negative codes (ICU reports warnings as negative values).
        pub fn is_success(&self) -> bool {
            self.0 <= 0
        }

        /// True for strictly positive codes.
        pub fn is_failure(&self) -> bool {
            self.0 > 0
        }

        /// Converts this code into an `apperr::Error` via `Error::new_icu`.
        ///
        /// Debug builds assert that the code is actually a failure.
        pub fn as_error(&self) -> apperr::Error {
            debug_assert!(self.0 > 0);
            apperr::Error::new_icu(self.0 as u32)
        }
    }

    pub const U_ZERO_ERROR: UErrorCode = UErrorCode(0);
    pub const U_BUFFER_OVERFLOW_ERROR: UErrorCode = UErrorCode(15);
    pub const U_UNSUPPORTED_ERROR: UErrorCode = UErrorCode(16);

    pub type u_errorName = unsafe extern "C" fn(code: UErrorCode) -> *const c_char;

    // Converter API (ucnv_*)

    /// Opaque handle; only ever used behind a raw pointer.
    pub struct UConverter;

    pub type ucnv_getAvailableName = unsafe extern "C" fn(n: i32) -> *const c_char;

    pub type ucnv_getStandardName = unsafe extern "C" fn(
        name: *const u8,
        standard: *const u8,
        status: &mut UErrorCode,
    ) -> *const c_char;

    pub type ucnv_open =
        unsafe extern "C" fn(converter_name: *const u8, status: &mut UErrorCode) -> *mut UConverter;

    pub type ucnv_close = unsafe extern "C" fn(converter: *mut UConverter);

    pub type ucnv_convertEx = unsafe extern "C" fn(
        target_cnv: *mut UConverter,
        source_cnv: *mut UConverter,
        target: *mut *mut u8,
        target_limit: *const u8,
        source: *mut *const u8,
        source_limit: *const u8,
        pivot_start: *mut u16,
        pivot_source: *mut *mut u16,
        pivot_target: *mut *mut u16,
        pivot_limit: *const u16,
        reset: bool,
        flush: bool,
        status: &mut UErrorCode,
    );

    // Case mapping API (ucasemap_*)

    /// Opaque handle; only ever used behind a raw pointer.
    pub struct UCaseMap;

    pub type ucasemap_open = unsafe extern "C" fn(
        locale: *const c_char,
        options: u32,
        status: &mut UErrorCode,
    ) -> *mut UCaseMap;

    pub type ucasemap_utf8FoldCase = unsafe extern "C" fn(
        csm: *const UCaseMap,
        dest: *mut c_char,
        dest_capacity: i32,
        src: *const c_char,
        src_length: i32,
        status: &mut UErrorCode,
    ) -> i32;

    // Collation API (ucol_*)

    /// Result of a collation comparison, returned by value from `ucol_strcollUTF8`.
    #[repr(C)]
    pub enum UCollationResult {
        UCOL_EQUAL = 0,
        UCOL_GREATER = 1,
        UCOL_LESS = -1,
    }

    /// Opaque handle; only ever used behind a raw pointer.
    #[repr(C)]
    pub struct UCollator;

    pub type ucol_open =
        unsafe extern "C" fn(loc: *const c_char, status: &mut UErrorCode) -> *mut UCollator;

    pub type ucol_strcollUTF8 = unsafe extern "C" fn(
        coll: *mut UCollator,
        source: *const u8,
        source_length: i32,
        target: *const u8,
        target_length: i32,
        status: &mut UErrorCode,
    ) -> UCollationResult;

    // UText callback functions
    //
    // These are the signatures of the entries in `UTextFuncs` below.

    pub type UTextClone = unsafe extern "C" fn(
        dest: *mut UText,
        src: &UText,
        deep: bool,
        status: &mut UErrorCode,
    ) -> *mut UText;

    pub type UTextNativeLength = unsafe extern "C" fn(ut: &mut UText) -> i64;

    pub type UTextAccess =
        unsafe extern "C" fn(ut: &mut UText, native_index: i64, forward: bool) -> bool;

    pub type UTextExtract = unsafe extern "C" fn(
        ut: &mut UText,
        native_start: i64,
        native_limit: i64,
        dest: *mut u16,
        dest_capacity: i32,
        status: &mut UErrorCode,
    ) -> i32;

    pub type UTextReplace = unsafe extern "C" fn(
        ut: &mut UText,
        native_start: i64,
        native_limit: i64,
        replacement_text: *const u16,
        replacement_length: i32,
        status: &mut UErrorCode,
    ) -> i32;

    pub type UTextCopy = unsafe extern "C" fn(
        ut: &mut UText,
        native_start: i64,
        native_limit: i64,
        native_dest: i64,
        move_text: bool,
        status: &mut UErrorCode,
    );

    pub type UTextMapOffsetToNative = unsafe extern "C" fn(ut: &UText) -> i64;

    pub type UTextMapNativeIndexToUTF16 =
        unsafe extern "C" fn(ut: &UText, native_index: i64) -> i32;

    pub type UTextClose = unsafe extern "C" fn(ut: &mut UText);

    /// Callback table referenced by `UText::p_funcs`.
    ///
    /// `#[repr(C)]`: the field order is intended to mirror ICU's `UTextFuncs`
    /// (utext.h) and must not be rearranged.
    #[repr(C)]
    pub struct UTextFuncs {
        pub table_size: i32,
        pub reserved1: i32,
        pub reserved2: i32,
        pub reserved3: i32,
        pub clone: Option<UTextClone>,
        pub native_length: Option<UTextNativeLength>,
        pub access: Option<UTextAccess>,
        pub extract: Option<UTextExtract>,
        pub replace: Option<UTextReplace>,
        pub copy: Option<UTextCopy>,
        pub map_offset_to_native: Option<UTextMapOffsetToNative>,
        pub map_native_index_to_utf16: Option<UTextMapNativeIndexToUTF16>,
        pub close: Option<UTextClose>,
        pub spare1: Option<UTextClose>,
        pub spare2: Option<UTextClose>,
        pub spare3: Option<UTextClose>,
    }

    /// Text provider state handed to ICU.
    ///
    /// `#[repr(C)]`: the field order and types are intended to mirror ICU's
    /// `UText` (utext.h) and must not be rearranged.
    #[repr(C)]
    pub struct UText {
        pub magic: u32,
        pub flags: i32,
        pub provider_properties: i32,
        pub size_of_struct: i32,
        pub chunk_native_limit: i64,
        pub extra_size: i32,
        pub native_indexing_limit: i32,
        pub chunk_native_start: i64,
        pub chunk_offset: i32,
        pub chunk_length: i32,
        pub chunk_contents: *const u16,
        pub p_funcs: &'static UTextFuncs,
        pub p_extra: *mut c_void,
        pub context: *mut c_void,
        pub p: *mut c_void,
        pub q: *mut c_void,
        pub r: *mut c_void,
        pub priv_p: *mut c_void,
        pub a: i64,
        pub b: i32,
        pub c: i32,
        pub priv_a: i64,
        pub priv_b: i32,
        pub priv_c: i32,
    }

    pub const UTEXT_MAGIC: u32 = 0x345ad82c;

    // NOTE(review): in ICU's utext.h these five values are bit *indices* into
    // `provider_properties` (to be used as `1 << n`), not ready-made masks -
    // confirm that call sites shift them accordingly.
    pub const UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE: i32 = 1;
    pub const UTEXT_PROVIDER_STABLE_CHUNKS: i32 = 2;
    pub const UTEXT_PROVIDER_WRITABLE: i32 = 3;
    pub const UTEXT_PROVIDER_HAS_META_DATA: i32 = 4;
    pub const UTEXT_PROVIDER_OWNS_TEXT: i32 = 5;

    pub type utext_setup = unsafe extern "C" fn(
        ut: *mut UText,
        extra_space: i32,
        status: &mut UErrorCode,
    ) -> *mut UText;

    pub type utext_close = unsafe extern "C" fn(ut: *mut UText) -> *mut UText;

    // Regular expression API (uregex_*)

    /// Optional out-parameter of `uregex_open`.
    #[repr(C)]
    pub struct UParseError {
        pub line: i32,
        pub offset: i32,
        pub pre_context: [u16; 16],
        pub post_context: [u16; 16],
    }

    /// Opaque handle; only ever used behind a raw pointer.
    #[repr(C)]
    pub struct URegularExpression;

    // Values for the `flags` argument of `uregex_open`; each is a distinct bit.
    pub const UREGEX_UNIX_LINES: i32 = 1;
    pub const UREGEX_CASE_INSENSITIVE: i32 = 2;
    pub const UREGEX_COMMENTS: i32 = 4;
    pub const UREGEX_MULTILINE: i32 = 8;
    pub const UREGEX_LITERAL: i32 = 16;
    pub const UREGEX_DOTALL: i32 = 32;
    pub const UREGEX_UWORD: i32 = 256;
    pub const UREGEX_ERROR_ON_UNKNOWN_ESCAPES: i32 = 512;

    pub type uregex_open = unsafe extern "C" fn(
        pattern: *const u16,
        pattern_length: i32,
        flags: i32,
        pe: Option<&mut UParseError>,
        status: &mut UErrorCode,
    ) -> *mut URegularExpression;

    pub type uregex_close = unsafe extern "C" fn(regexp: *mut URegularExpression);

    pub type uregex_setTimeLimit =
        unsafe extern "C" fn(regexp: *mut URegularExpression, limit: i32, status: &mut UErrorCode);

    pub type uregex_setUText = unsafe extern "C" fn(
        regexp: *mut URegularExpression,
        text: *mut UText,
        status: &mut UErrorCode,
    );

    pub type uregex_reset64 =
        unsafe extern "C" fn(regexp: *mut URegularExpression, index: i64, status: &mut UErrorCode);

    pub type uregex_findNext =
        unsafe extern "C" fn(regexp: *mut URegularExpression, status: &mut UErrorCode) -> bool;

    pub type uregex_start64 = unsafe extern "C" fn(
        regexp: *mut URegularExpression,
        group_num: i32,
        status: &mut UErrorCode,
    ) -> i64;

    pub type uregex_end64 = unsafe extern "C" fn(
        regexp: *mut URegularExpression,
        group_num: i32,
        status: &mut UErrorCode,
    ) -> i64;
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises `compare_strings_ascii` on empty, equal, prefix and mixed-case inputs.
    #[test]
    fn test_compare_strings_ascii() {
        // Empty strings
        assert_eq!(compare_strings_ascii(b"", b""), Ordering::Equal);
        // Equal strings
        assert_eq!(compare_strings_ascii(b"hello", b"hello"), Ordering::Equal);
        // Different lengths
        assert_eq!(compare_strings_ascii(b"abc", b"abcd"), Ordering::Less);
        assert_eq!(compare_strings_ascii(b"abcd", b"abc"), Ordering::Greater);
        // Same chars, different cases - 1st char wins
        assert_eq!(compare_strings_ascii(b"AbC", b"aBc"), Ordering::Less);
        // Different chars, different cases - 2nd char wins, because it differs
        assert_eq!(compare_strings_ascii(b"hallo", b"Hello"), Ordering::Less);
        assert_eq!(compare_strings_ascii(b"Hello", b"hallo"), Ordering::Greater);
    }
}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/mixelpixx/microsoft-edit-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.